We want to look at real datasets to simulate realistic datasets.

data <- read.table("expression_mRNA_17-Aug-2014.txt", sep='\t', stringsAsFactors = FALSE, comment.char = '%')

level1 <- as.factor(as.matrix(data)[9,-(1:2)])
tissue <- as.factor(as.matrix(data)[1,-(1:2)])
table(tissue)
## tissue
## ca1hippocampus       sscortex 
##           1314           1691
group <- as.factor(as.matrix(data)[2,-(1:2)])
table(tissue, group)
##                 group
## tissue             1   2   3   4   5   6   7   8   9
##   ca1hippocampus 126  20 919 121  14  18  64  17  15
##   sscortex       164 370  29 699  84 157 134   9  45
nmolecule <- as.numeric(as.matrix(data)[3,-(1:2)])
sex <- as.factor(as.matrix(data)[5,-(1:2)])
table(tissue, sex)
##                 sex
## tissue            -1   0   1
##   ca1hippocampus 748  46 520
##   sscortex       776   0 915
level2 <- as.factor(as.matrix(data)[10,-(1:2)])
table(level1)
## level1
## astrocytes_ependymal    endothelial-mural         interneurons 
##                  224                  235                  290 
##            microglia     oligodendrocytes        pyramidal CA1 
##                   98                  820                  939 
##         pyramidal SS 
##                  399
table(level1, level2)
##                       level2
## level1                 (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt
##   astrocytes_ependymal     65     68     61       0       0         0
##   endothelial-mural        15      0      0       0       0         0
##   interneurons              0      0      0       0       0         0
##   microglia                 0      0      0       0       0         0
##   oligodendrocytes          0      0      0       0       0         0
##   pyramidal CA1             0      0      0     380     447        49
##   pyramidal SS            109      0      0       0       0         0
##                       level2
## level1                 CA2Pyr2 Choroid ClauPyr Epend Int1 Int10 Int11
##   astrocytes_ependymal       0      10       0    20    0     0     0
##   endothelial-mural          0       0       0     0    0     0     0
##   interneurons               0       0       0     0   12    21    10
##   microglia                  0       0       0     0    0     0     0
##   oligodendrocytes           0       0       0     0    0     0     0
##   pyramidal CA1             41       0       0     0    0     0     0
##   pyramidal SS               0       0       5     0    0     0     0
##                       level2
## level1                 Int12 Int13 Int14 Int15 Int16 Int2 Int3 Int4 Int5
##   astrocytes_ependymal     0     0     0     0     0    0    0    0    0
##   endothelial-mural        0     0     0     0     0    0    0    0    0
##   interneurons            21    15    22    18    20   24   10   15   20
##   microglia                0     0     0     0     0    0    0    0    0
##   oligodendrocytes         0     0     0     0     0    0    0    0    0
##   pyramidal CA1            0     0     0     0     0    0    0    0    0
##   pyramidal SS             0     0     0     0     0    0    0    0    0
##                       level2
## level1                 Int6 Int7 Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3
##   astrocytes_ependymal    0    0    0    0    0    0      0      0      0
##   endothelial-mural       0    0    0    0    0    0      0      0      0
##   interneurons           22   23   26   11    0    0      0      0      0
##   microglia               0    0    0    0   17   16      0      0      0
##   oligodendrocytes        0    0    0    0    0    0     45     98     87
##   pyramidal CA1           0    0    0    0    0    0      0      0      0
##   pyramidal SS            0    0    0    0    0    0      0      0      0
##                       level2
## level1                 Oligo4 Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL
##   astrocytes_ependymal      0      0      0     0    0    0       0
##   endothelial-mural         0      0      0    21    0    0       0
##   interneurons              0      0      0     0    0    0       0
##   microglia                 0      0      0     0   32   33       0
##   oligodendrocytes        106    125    359     0    0    0       0
##   pyramidal CA1             0      0      0     0    0    0       0
##   pyramidal SS              0      0      0     0    0    0      81
##                       level2
## level1                 S1PyrL23 S1PyrL4 S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b
##   astrocytes_ependymal        0       0       0        0       0        0
##   endothelial-mural           0       0       0        0       0        0
##   interneurons                0       0       0        0       0        0
##   microglia                   0       0       0        0       0        0
##   oligodendrocytes            0       0       0        0       0        0
##   pyramidal CA1               0       0       0        0       0        0
##   pyramidal SS               74      26      16       28      39       21
##                       level2
## level1                 SubPyr Vend1 Vend2 Vsmc
##   astrocytes_ependymal      0     0     0    0
##   endothelial-mural         0    32   105   62
##   interneurons              0     0     0    0
##   microglia                 0     0     0    0
##   oligodendrocytes          0     0     0    0
##   pyramidal CA1            22     0     0    0
##   pyramidal SS              0     0     0    0
counts <- as.matrix(data[12:NCOL(data),-(1:2)]) ## why NCOL here, not not NROW?
counts <- matrix(as.numeric(counts), ncol=ncol(counts), nrow=nrow(counts))
rownames(counts) <- data[12:NCOL(data),1]
colnames(counts) <- data[8, -(1:2)]

select = level1 %in% c('pyramidal CA1', 'interneurons',
                       'astrocytes_ependymal', 'oligodendrocytes')
counts = counts[, select]
group = droplevels(group[select])
sex = sex[select]
tissue = tissue[select]
level1 = droplevels(level1[select])
level2 = level2[select]
table(tissue)
## tissue
## ca1hippocampus       sscortex 
##           1267           1006
table(tissue, group)
##                 group
## tissue             1   2   3   4   7   8
##   ca1hippocampus 126  20 919 121  64  17
##   sscortex       164   0   0 699 134   9
table(tissue, sex)
##                 sex
## tissue            -1   0   1
##   ca1hippocampus 710  46 511
##   sscortex       444   0 562
table(level1)
## level1
## astrocytes_ependymal         interneurons     oligodendrocytes 
##                  224                  290                  820 
##        pyramidal CA1 
##                  939
table(level1, level2)
##                       level2
## level1                 (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt
##   astrocytes_ependymal     65     68     61       0       0         0
##   interneurons              0      0      0       0       0         0
##   oligodendrocytes          0      0      0       0       0         0
##   pyramidal CA1             0      0      0     380     447        49
##                       level2
## level1                 CA2Pyr2 Choroid ClauPyr Epend Int1 Int10 Int11
##   astrocytes_ependymal       0      10       0    20    0     0     0
##   interneurons               0       0       0     0   12    21    10
##   oligodendrocytes           0       0       0     0    0     0     0
##   pyramidal CA1             41       0       0     0    0     0     0
##                       level2
## level1                 Int12 Int13 Int14 Int15 Int16 Int2 Int3 Int4 Int5
##   astrocytes_ependymal     0     0     0     0     0    0    0    0    0
##   interneurons            21    15    22    18    20   24   10   15   20
##   oligodendrocytes         0     0     0     0     0    0    0    0    0
##   pyramidal CA1            0     0     0     0     0    0    0    0    0
##                       level2
## level1                 Int6 Int7 Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3
##   astrocytes_ependymal    0    0    0    0    0    0      0      0      0
##   interneurons           22   23   26   11    0    0      0      0      0
##   oligodendrocytes        0    0    0    0    0    0     45     98     87
##   pyramidal CA1           0    0    0    0    0    0      0      0      0
##                       level2
## level1                 Oligo4 Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL
##   astrocytes_ependymal      0      0      0     0    0    0       0
##   interneurons              0      0      0     0    0    0       0
##   oligodendrocytes        106    125    359     0    0    0       0
##   pyramidal CA1             0      0      0     0    0    0       0
##                       level2
## level1                 S1PyrL23 S1PyrL4 S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b
##   astrocytes_ependymal        0       0       0        0       0        0
##   interneurons                0       0       0        0       0        0
##   oligodendrocytes            0       0       0        0       0        0
##   pyramidal CA1               0       0       0        0       0        0
##                       level2
## level1                 SubPyr Vend1 Vend2 Vsmc
##   astrocytes_ependymal      0     0     0    0
##   interneurons              0     0     0    0
##   oligodendrocytes          0     0     0    0
##   pyramidal CA1            22     0     0    0
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)

col1 <- brewer.pal(9, "Set1")
col2 <- brewer.pal(8, "Set2")
colTissue <- col1[tissue]
colMerged <- col2[level1]

To speed up the computations, we will focus on the top 1,000 most variable genes.

par(mfrow = c(1,2))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(core == 0), xlim = c(0,4), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = 'no filtering')
smoothScatter(mean, rowMeans(core == 0), xlim = c(0,4),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

Pink and glia cells have been removed. Fit data with K = 3, V intercepts in both Mu and Pi, commondispersion = FALSE, with X accounting for batch, qc.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 3,
                                  commondispersion = FALSE)))
##     user   system  elapsed 
## 3473.381 1547.494 1999.622

True W

If we simulate W from real data, it will look like that.

pairs(zinb@W, col=colMerged, pch=19, main="W\ncolor = level1")

pairs(zinb@W, col=colTissue, pch=19, main="W\ncolor = tissue")

Gamma

gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.8374696      0.9240038  0.9818697
## gamma_pi       -0.8374696  1.0000000     -0.9695456 -0.8902028
## detection_rate  0.9240038 -0.9695456      1.0000000  0.9508003
## coverage        0.9818697 -0.8902028      0.9508003  1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi,
                   tissue = colTissue, level = colMerged)
gamma = melt(gamma)

ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable) + xlab('gamma')

ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

ggplot(gamma, aes(value, fill = level)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

ggplot(gamma, aes(value, fill = tissue)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

Beta

beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

df <- data.frame(beta_mu_intercept = beta_mu,
                 beta_pi_intercept = beta_pi)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                   beta_mu_intercept beta_pi_intercept
## beta_mu_intercept         1.0000000        -0.6913616
## beta_pi_intercept        -0.6913616         1.0000000
beta = data.frame(beta_mu_intercept = beta_mu, beta_pi_intercept = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable, scales = 'free') + xlab('beta')

ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(min, max))
## Warning: Removed 1478 rows containing non-finite values (stat_boxplot).

Alpha

pairs(t(zinb@alpha_mu), main = 'alpha_mu')

pairs(t(zinb@alpha_pi), main = 'alpha_pi')

df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_mu_3 = zinb@alpha_mu[3, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ],
                 alpha_pi_3 = zinb@alpha_pi[3, ])
pairs(df, pch=19)

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2  alpha_mu_3  alpha_pi_1  alpha_pi_2
## alpha_mu_1  1.00000000 -0.02687891 -0.02637747 -0.63880851  0.01041533
## alpha_mu_2 -0.02687891  1.00000000  0.05989897  0.11732876 -0.38999690
## alpha_mu_3 -0.02637747  0.05989897  1.00000000  0.20936722 -0.22065224
## alpha_pi_1 -0.63880851  0.11732876  0.20936722  1.00000000 -0.02304856
## alpha_pi_2  0.01041533 -0.38999690 -0.22065224 -0.02304856  1.00000000
## alpha_pi_3  0.26979545 -0.18480626 -0.61527315 -0.30730654  0.03685982
##             alpha_pi_3
## alpha_mu_1  0.26979545
## alpha_mu_2 -0.18480626
## alpha_mu_3 -0.61527315
## alpha_pi_1 -0.30730654
## alpha_pi_2  0.03685982
## alpha_pi_3  1.00000000

Dispersion

Something strange happened for the dispersion.

par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.05474489
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion', xlim = c(0,10))
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion', xlim = c(0,10))

par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 9))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 9))

Estimated mu and pi

par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=colMerged, ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=colMerged)

par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(0,4),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'True')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(0,4),
              xlab = "Estimated Mean Expression", main = 'Estimated',
              ylab = "Estimated Dropout Rate",ylim = c(0,1),
              colramp = pal)
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
     xlab = "Estimated Mean Expression", xlim = c(0,4),
     ylab = "Estimated Dropout Rate", pch=19,
     ylim = c(0, 1), main = 'Estimated')